default rel
%define XMMWORD
%define YMMWORD
%define ZMMWORD
EXTERN  OPENSSL_ia32cap_P
global  ossl_rsaz_avx512ifma_eligible

;-----------------------------------------------------------------------
; ossl_rsaz_avx512ifma_eligible
;
; Checks that bits 16, 17, 21 and 31 are all set in the dword stored at
; OPENSSL_ia32cap_P+8.
; NOTE(review): presumably these are the AVX512F / AVX512DQ /
; AVX512IFMA / AVX512VL feature flags -- confirm against the layout of
; OPENSSL_ia32cap_P.
;
; In:    nothing (reads the global capability vector only)
; Out:   eax = RSAZ_IFMA_CAP_BITS (non-zero) when every bit is set,
;        eax = 0 otherwise
; Clobb: ecx, flags
;-----------------------------------------------------------------------
%define RSAZ_IFMA_CAP_BITS ((1<<31)|(1<<21)|(1<<17)|(1<<16))

ALIGN   32
ossl_rsaz_avx512ifma_eligible:
        mov     eax,DWORD[((OPENSSL_ia32cap_P+8))]
        xor     ecx,ecx                 ; ecx = 0, the "not eligible" answer
        and     eax,RSAZ_IFMA_CAP_BITS  ; keep only the required bits
        cmp     eax,RSAZ_IFMA_CAP_BITS
        cmovne  eax,ecx                 ; at least one bit missing -> 0
        DB      0F3h,0C3h               ;repret

section .text code align=64


global  ossl_rsaz_amm52x20_x1_256

;-----------------------------------------------------------------------
; ossl_rsaz_amm52x20_x1_256(arg1, arg2, arg3, arg4, arg5)
;
; Multiply / accumulate / reduce kernel on 20 qword digits held in five
; YMM registers, built on the 52-bit integer FMA instructions
; (vpmadd52luq / vpmadd52huq).
; NOTE(review): the name suggests an "almost Montgomery multiplication"
; of 20 x 52-bit digits; confirm the exact mathematical contract against
; the generating script.
;
; ABI:   Microsoft x64 on entry. The arguments are moved straight into
;        the registers the body is written for:
;          rcx      -> rdi  arg1: output, 160 bytes written
;          rdx      -> rsi  arg2: operand A, 160 bytes read
;          r8       -> rdx  arg3: operand B, 20 qwords read (one per step)
;          r9       -> rcx  arg4: operand M, 160 bytes read
;          [rsp+40] -> r8   arg5: 64-bit factor k (passed by value)
; Out:   160 bytes at arg1; every stored qword is masked to 52 bits.
; Stack: rdi/rsi are parked in the caller's home space, then six
;        registers are pushed (48 bytes).
; Clobb: rax, rcx, rdx, r8-r11, ymm0, ymm1, ymm3, ymm4, ymm16-ymm19,
;        ymm24-ymm28, k1-k5, flags. rbx, rbp, r12-r15, rdi and rsi are
;        restored before returning.
;-----------------------------------------------------------------------
ALIGN   32
ossl_rsaz_amm52x20_x1_256:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp                 ; entry rsp, read by rsaz_def_handler before the body label
$L$SEH_begin_ossl_rsaz_amm52x20_x1_256:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        mov     r8,QWORD[40+rsp]        ; fifth argument lives on the stack



DB      243,15,30,250                   ; bytes F3 0F 1E FA = endbr64
        push    rbx

        push    rbp

        push    r12

        push    r13

        push    r14

        push    r15

$L$rsaz_amm52x20_x1_256_body:

; ymm0 stays zero for the whole routine. ymm1, ymm16..ymm19 form the
; 20-lane accumulator (lane 0 = low qword of ymm1).
        vpxord  ymm0,ymm0,ymm0
        vmovdqa64       ymm1,ymm0
        vmovdqa64       ymm16,ymm0
        vmovdqa64       ymm17,ymm0
        vmovdqa64       ymm18,ymm0
        vmovdqa64       ymm19,ymm0

        xor     r9d,r9d                 ; r9 = scalar accumulator for lane 0

        mov     r11,rdx                 ; r11 = cursor into B (rdx is needed by mulx)
        mov     rax,0xfffffffffffff     ; rax = 2^52 - 1


        mov     ebx,5                   ; 5 iterations x 4 unrolled steps = 20 digits of B

ALIGN   32
$L$loop5:
; ---- step 0: digit B[0] of this group -------------------------------
        mov     r13,QWORD[r11]          ; r13 = B[i]

        vpbroadcastq    ymm3,r13        ; ymm3 = B[i] in every lane
        mov     rdx,QWORD[rsi]          ; rdx = A[0] (implicit mulx operand)
        mulx    r12,r13,r13             ; r12:r13 = A[0] * B[i]
        add     r9,r13                  ; acc += low half
        mov     r10,r12
        adc     r10,0                   ; r10 = high half + carry

        mov     r13,r8
        imul    r13,r9
        and     r13,rax                 ; r13 = (k * acc) mod 2^52

        vpbroadcastq    ymm4,r13        ; ymm4 = that multiplier in every lane
        mov     rdx,QWORD[rcx]          ; rdx = M[0]
        mulx    r12,r13,r13             ; r12:r13 = M[0] * multiplier
        add     r9,r13
        adc     r10,r12                 ; r10:r9 += product

        shr     r9,52
        sal     r10,12
        or      r9,r10                  ; acc = (r10:r9) >> 52

        ; lanes += low 52 bits of B[i] * A[j]
        vpmadd52luq     ymm1,ymm3,YMMWORD[rsi]
        vpmadd52luq     ymm16,ymm3,YMMWORD[32+rsi]
        vpmadd52luq     ymm17,ymm3,YMMWORD[64+rsi]
        vpmadd52luq     ymm18,ymm3,YMMWORD[96+rsi]
        vpmadd52luq     ymm19,ymm3,YMMWORD[128+rsi]

        ; lanes += low 52 bits of multiplier * M[j]
        vpmadd52luq     ymm1,ymm4,YMMWORD[rcx]
        vpmadd52luq     ymm16,ymm4,YMMWORD[32+rcx]
        vpmadd52luq     ymm17,ymm4,YMMWORD[64+rcx]
        vpmadd52luq     ymm18,ymm4,YMMWORD[96+rcx]
        vpmadd52luq     ymm19,ymm4,YMMWORD[128+rcx]


        ; Move the whole accumulator down by one qword lane; a zero lane
        ; (from ymm0) enters at the top.
        valignq ymm1,ymm16,ymm1,1
        valignq ymm16,ymm17,ymm16,1
        valignq ymm17,ymm18,ymm17,1
        valignq ymm18,ymm19,ymm18,1
        valignq ymm19,ymm0,ymm19,1

        vmovq   r13,xmm1                ; new lane 0 ...
        add     r9,r13                  ; ... is folded into the scalar accumulator

        ; lanes += high 52 bits of the same products, applied after the
        ; lane shift so they land one digit higher than the low halves
        vpmadd52huq     ymm1,ymm3,YMMWORD[rsi]
        vpmadd52huq     ymm16,ymm3,YMMWORD[32+rsi]
        vpmadd52huq     ymm17,ymm3,YMMWORD[64+rsi]
        vpmadd52huq     ymm18,ymm3,YMMWORD[96+rsi]
        vpmadd52huq     ymm19,ymm3,YMMWORD[128+rsi]

        vpmadd52huq     ymm1,ymm4,YMMWORD[rcx]
        vpmadd52huq     ymm16,ymm4,YMMWORD[32+rcx]
        vpmadd52huq     ymm17,ymm4,YMMWORD[64+rcx]
        vpmadd52huq     ymm18,ymm4,YMMWORD[96+rcx]
        vpmadd52huq     ymm19,ymm4,YMMWORD[128+rcx]
; ---- step 1: same sequence for the next digit of B -------------------
        mov     r13,QWORD[8+r11]

        vpbroadcastq    ymm3,r13
        mov     rdx,QWORD[rsi]
        mulx    r12,r13,r13
        add     r9,r13
        mov     r10,r12
        adc     r10,0

        mov     r13,r8
        imul    r13,r9
        and     r13,rax

        vpbroadcastq    ymm4,r13
        mov     rdx,QWORD[rcx]
        mulx    r12,r13,r13
        add     r9,r13
        adc     r10,r12

        shr     r9,52
        sal     r10,12
        or      r9,r10

        vpmadd52luq     ymm1,ymm3,YMMWORD[rsi]
        vpmadd52luq     ymm16,ymm3,YMMWORD[32+rsi]
        vpmadd52luq     ymm17,ymm3,YMMWORD[64+rsi]
        vpmadd52luq     ymm18,ymm3,YMMWORD[96+rsi]
        vpmadd52luq     ymm19,ymm3,YMMWORD[128+rsi]

        vpmadd52luq     ymm1,ymm4,YMMWORD[rcx]
        vpmadd52luq     ymm16,ymm4,YMMWORD[32+rcx]
        vpmadd52luq     ymm17,ymm4,YMMWORD[64+rcx]
        vpmadd52luq     ymm18,ymm4,YMMWORD[96+rcx]
        vpmadd52luq     ymm19,ymm4,YMMWORD[128+rcx]


        valignq ymm1,ymm16,ymm1,1
        valignq ymm16,ymm17,ymm16,1
        valignq ymm17,ymm18,ymm17,1
        valignq ymm18,ymm19,ymm18,1
        valignq ymm19,ymm0,ymm19,1

        vmovq   r13,xmm1
        add     r9,r13

        vpmadd52huq     ymm1,ymm3,YMMWORD[rsi]
        vpmadd52huq     ymm16,ymm3,YMMWORD[32+rsi]
        vpmadd52huq     ymm17,ymm3,YMMWORD[64+rsi]
        vpmadd52huq     ymm18,ymm3,YMMWORD[96+rsi]
        vpmadd52huq     ymm19,ymm3,YMMWORD[128+rsi]

        vpmadd52huq     ymm1,ymm4,YMMWORD[rcx]
        vpmadd52huq     ymm16,ymm4,YMMWORD[32+rcx]
        vpmadd52huq     ymm17,ymm4,YMMWORD[64+rcx]
        vpmadd52huq     ymm18,ymm4,YMMWORD[96+rcx]
        vpmadd52huq     ymm19,ymm4,YMMWORD[128+rcx]
; ---- step 2 -----------------------------------------------------------
        mov     r13,QWORD[16+r11]

        vpbroadcastq    ymm3,r13
        mov     rdx,QWORD[rsi]
        mulx    r12,r13,r13
        add     r9,r13
        mov     r10,r12
        adc     r10,0

        mov     r13,r8
        imul    r13,r9
        and     r13,rax

        vpbroadcastq    ymm4,r13
        mov     rdx,QWORD[rcx]
        mulx    r12,r13,r13
        add     r9,r13
        adc     r10,r12

        shr     r9,52
        sal     r10,12
        or      r9,r10

        vpmadd52luq     ymm1,ymm3,YMMWORD[rsi]
        vpmadd52luq     ymm16,ymm3,YMMWORD[32+rsi]
        vpmadd52luq     ymm17,ymm3,YMMWORD[64+rsi]
        vpmadd52luq     ymm18,ymm3,YMMWORD[96+rsi]
        vpmadd52luq     ymm19,ymm3,YMMWORD[128+rsi]

        vpmadd52luq     ymm1,ymm4,YMMWORD[rcx]
        vpmadd52luq     ymm16,ymm4,YMMWORD[32+rcx]
        vpmadd52luq     ymm17,ymm4,YMMWORD[64+rcx]
        vpmadd52luq     ymm18,ymm4,YMMWORD[96+rcx]
        vpmadd52luq     ymm19,ymm4,YMMWORD[128+rcx]


        valignq ymm1,ymm16,ymm1,1
        valignq ymm16,ymm17,ymm16,1
        valignq ymm17,ymm18,ymm17,1
        valignq ymm18,ymm19,ymm18,1
        valignq ymm19,ymm0,ymm19,1

        vmovq   r13,xmm1
        add     r9,r13

        vpmadd52huq     ymm1,ymm3,YMMWORD[rsi]
        vpmadd52huq     ymm16,ymm3,YMMWORD[32+rsi]
        vpmadd52huq     ymm17,ymm3,YMMWORD[64+rsi]
        vpmadd52huq     ymm18,ymm3,YMMWORD[96+rsi]
        vpmadd52huq     ymm19,ymm3,YMMWORD[128+rsi]

        vpmadd52huq     ymm1,ymm4,YMMWORD[rcx]
        vpmadd52huq     ymm16,ymm4,YMMWORD[32+rcx]
        vpmadd52huq     ymm17,ymm4,YMMWORD[64+rcx]
        vpmadd52huq     ymm18,ymm4,YMMWORD[96+rcx]
        vpmadd52huq     ymm19,ymm4,YMMWORD[128+rcx]
; ---- step 3 -----------------------------------------------------------
        mov     r13,QWORD[24+r11]

        vpbroadcastq    ymm3,r13
        mov     rdx,QWORD[rsi]
        mulx    r12,r13,r13
        add     r9,r13
        mov     r10,r12
        adc     r10,0

        mov     r13,r8
        imul    r13,r9
        and     r13,rax

        vpbroadcastq    ymm4,r13
        mov     rdx,QWORD[rcx]
        mulx    r12,r13,r13
        add     r9,r13
        adc     r10,r12

        shr     r9,52
        sal     r10,12
        or      r9,r10

        vpmadd52luq     ymm1,ymm3,YMMWORD[rsi]
        vpmadd52luq     ymm16,ymm3,YMMWORD[32+rsi]
        vpmadd52luq     ymm17,ymm3,YMMWORD[64+rsi]
        vpmadd52luq     ymm18,ymm3,YMMWORD[96+rsi]
        vpmadd52luq     ymm19,ymm3,YMMWORD[128+rsi]

        vpmadd52luq     ymm1,ymm4,YMMWORD[rcx]
        vpmadd52luq     ymm16,ymm4,YMMWORD[32+rcx]
        vpmadd52luq     ymm17,ymm4,YMMWORD[64+rcx]
        vpmadd52luq     ymm18,ymm4,YMMWORD[96+rcx]
        vpmadd52luq     ymm19,ymm4,YMMWORD[128+rcx]


        valignq ymm1,ymm16,ymm1,1
        valignq ymm16,ymm17,ymm16,1
        valignq ymm17,ymm18,ymm17,1
        valignq ymm18,ymm19,ymm18,1
        valignq ymm19,ymm0,ymm19,1

        vmovq   r13,xmm1
        add     r9,r13

        vpmadd52huq     ymm1,ymm3,YMMWORD[rsi]
        vpmadd52huq     ymm16,ymm3,YMMWORD[32+rsi]
        vpmadd52huq     ymm17,ymm3,YMMWORD[64+rsi]
        vpmadd52huq     ymm18,ymm3,YMMWORD[96+rsi]
        vpmadd52huq     ymm19,ymm3,YMMWORD[128+rsi]

        vpmadd52huq     ymm1,ymm4,YMMWORD[rcx]
        vpmadd52huq     ymm16,ymm4,YMMWORD[32+rcx]
        vpmadd52huq     ymm17,ymm4,YMMWORD[64+rcx]
        vpmadd52huq     ymm18,ymm4,YMMWORD[96+rcx]
        vpmadd52huq     ymm19,ymm4,YMMWORD[128+rcx]
        lea     r11,[32+r11]            ; advance B by the four digits just consumed
        dec     ebx
        jne     NEAR $L$loop5

; ---- normalisation: bring every lane back to 52 bits ------------------
        vmovdqa64       ymm4,YMMWORD[$L$mask52x4]       ; ymm4 = 2^52-1 per lane (aligned load)

        vpbroadcastq    ymm3,r9
        vpblendd        ymm1,ymm1,ymm3,3        ; lane 0 = scalar accumulator (dwords 0-1)



        ; ymm24..ymm28 = bits 52 and above of every lane (the "carries")
        vpsrlq  ymm24,ymm1,52
        vpsrlq  ymm25,ymm16,52
        vpsrlq  ymm26,ymm17,52
        vpsrlq  ymm27,ymm18,52
        vpsrlq  ymm28,ymm19,52


        ; move the carries up by one lane across the five registers;
        ; lane 0 receives zero from ymm0
        valignq ymm28,ymm28,ymm27,3
        valignq ymm27,ymm27,ymm26,3
        valignq ymm26,ymm26,ymm25,3
        valignq ymm25,ymm25,ymm24,3
        valignq ymm24,ymm24,ymm0,3


        ; drop the carry bits from the lanes ...
        vpandq  ymm1,ymm1,ymm4
        vpandq  ymm16,ymm16,ymm4
        vpandq  ymm17,ymm17,ymm4
        vpandq  ymm18,ymm18,ymm4
        vpandq  ymm19,ymm19,ymm4


        ; ... and add each carry to the next higher lane
        vpaddq  ymm1,ymm1,ymm24
        vpaddq  ymm16,ymm16,ymm25
        vpaddq  ymm17,ymm17,ymm26
        vpaddq  ymm18,ymm18,ymm27
        vpaddq  ymm19,ymm19,ymm28



        ; predicate 1 (less-than): lanes where 2^52-1 < lane, i.e. the
        ; addition above overflowed 52 bits
        vpcmpuq k1,ymm4,ymm1,1
        vpcmpuq k2,ymm4,ymm16,1
        vpcmpuq k3,ymm4,ymm17,1
        vpcmpuq k4,ymm4,ymm18,1
        vpcmpuq k5,ymm4,ymm19,1
        kmovb   r14d,k1
        kmovb   r13d,k2
        kmovb   r12d,k3
        kmovb   r11d,k4
        kmovb   r10d,k5


        ; predicate 0 (equal): lanes that are exactly 2^52-1 and would
        ; pass an incoming carry on
        vpcmpuq k1,ymm4,ymm1,0
        vpcmpuq k2,ymm4,ymm16,0
        vpcmpuq k3,ymm4,ymm17,0
        vpcmpuq k4,ymm4,ymm18,0
        vpcmpuq k5,ymm4,ymm19,0
        kmovb   r9d,k1
        kmovb   r8d,k2
        kmovb   ebx,k3
        kmovb   ecx,k4
        kmovb   edx,k5



        ; pack the 4-bit overflow masks pairwise into bytes:
        ; r14b = lanes 0-7, r12b = lanes 8-15, r10b = lanes 16-19
        shl     r13b,4
        or      r14b,r13b
        shl     r11b,4
        or      r12b,r11b

        ; shift the 20-bit overflow mask up by one lane (x + x with carry)
        add     r14b,r14b
        adc     r12b,r12b
        adc     r10b,r10b

        ; pack the "saturated" masks the same way: r9b, bl, dl
        shl     r8b,4
        or      r9b,r8b
        shl     cl,4
        or      bl,cl

        ; adding the saturated mask lets a carry ripple through runs of
        ; saturated lanes ...
        add     r14b,r9b
        adc     r12b,bl
        adc     r10b,dl

        ; ... and the xor leaves a 1 for every lane that receives a carry
        xor     r14b,r9b
        xor     r12b,bl
        xor     r10b,dl

        ; unpack back into one 4-bit mask per register
        kmovb   k1,r14d
        shr     r14b,4
        kmovb   k2,r14d
        kmovb   k3,r12d
        shr     r12b,4
        kmovb   k4,r12d
        kmovb   k5,r10d


        ; selected lanes: subtract 2^52-1; together with the masking
        ; below this is "+1 modulo 2^52"
        vpsubq  ymm1{k1},ymm1,ymm4
        vpsubq  ymm16{k2},ymm16,ymm4
        vpsubq  ymm17{k3},ymm17,ymm4
        vpsubq  ymm18{k4},ymm18,ymm4
        vpsubq  ymm19{k5},ymm19,ymm4

        vpandq  ymm1,ymm1,ymm4
        vpandq  ymm16,ymm16,ymm4
        vpandq  ymm17,ymm17,ymm4
        vpandq  ymm18,ymm18,ymm4
        vpandq  ymm19,ymm19,ymm4

        ; store the 20 result digits (unaligned stores)
        vmovdqu64       YMMWORD[rdi],ymm1
        vmovdqu64       YMMWORD[32+rdi],ymm16
        vmovdqu64       YMMWORD[64+rdi],ymm17
        vmovdqu64       YMMWORD[96+rdi],ymm18
        vmovdqu64       YMMWORD[128+rdi],ymm19

        vzeroupper
        ; reload the six pushed registers (reverse push order) and
        ; release the 48 bytes they occupied
        mov     r15,QWORD[rsp]

        mov     r14,QWORD[8+rsp]

        mov     r13,QWORD[16+rsp]

        mov     r12,QWORD[24+rsp]

        mov     rbp,QWORD[32+rsp]

        mov     rbx,QWORD[40+rsp]

        lea     rsp,[48+rsp]

$L$rsaz_amm52x20_x1_256_epilogue:
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret

$L$SEH_end_ossl_rsaz_amm52x20_x1_256:
section .data data align=8

; Four qwords of 2^52-1: one 256-bit vector with the low 52 bits set in
; every lane. It is read with the aligned vmovdqa64 form by both amm
; routines, which is why the label is preceded by ALIGN 32.
ALIGN   32
$L$mask52x4:
        DQ      0xfffffffffffff
        DQ      0xfffffffffffff
        DQ      0xfffffffffffff
        DQ      0xfffffffffffff
section .text code align=64


global  ossl_rsaz_amm52x20_x2_256

;-----------------------------------------------------------------------
; ossl_rsaz_amm52x20_x2_256(arg1, arg2, arg3, arg4, arg5)
;
; Two independent instances of the 20-digit multiply / accumulate /
; reduce kernel, interleaved in one loop. The first instance works on
; bytes 0..159 of every array, the second on bytes 160..319.
; NOTE(review): the name suggests two "almost Montgomery
; multiplications" of 20 x 52-bit digits; confirm the exact
; mathematical contract against the generating script.
;
; ABI:   Microsoft x64 on entry. The arguments are moved straight into
;        the registers the body is written for:
;          rcx      -> rdi  arg1: output, 320 bytes written
;          rdx      -> rsi  arg2: operand A, 320 bytes read
;          r8       -> rdx  arg3: operand B, qwords [i] and [20+i] read
;                                 for i = 0..19
;          r9       -> rcx  arg4: operand M, 320 bytes read
;          [rsp+40] -> r8   arg5: pointer to two 64-bit factors
;                                 (k for each instance)
; Out:   320 bytes at arg1; every stored qword is masked to 52 bits.
; Stack: rdi/rsi are parked in the caller's home space, then six
;        registers are pushed (48 bytes).
; Clobb: rax, rcx, rdx, r8-r11, ymm0-ymm4, ymm16-ymm28, k1-k5, flags.
;        rbx, rbp, r12-r15, rdi and rsi are restored before returning.
;-----------------------------------------------------------------------
ALIGN   32
ossl_rsaz_amm52x20_x2_256:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp                 ; entry rsp, read by rsaz_def_handler before the body label
$L$SEH_begin_ossl_rsaz_amm52x20_x2_256:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        mov     r8,QWORD[40+rsp]        ; fifth argument lives on the stack



DB      243,15,30,250                   ; bytes F3 0F 1E FA = endbr64
        push    rbx

        push    rbp

        push    r12

        push    r13

        push    r14

        push    r15

$L$rsaz_amm52x20_x2_256_body:

; ymm0 stays zero. ymm1, ymm16..ymm19 accumulate the first instance,
; ymm2, ymm20..ymm23 the second.
        vpxord  ymm0,ymm0,ymm0
        vmovdqa64       ymm1,ymm0
        vmovdqa64       ymm16,ymm0
        vmovdqa64       ymm17,ymm0
        vmovdqa64       ymm18,ymm0
        vmovdqa64       ymm19,ymm0
        vmovdqa64       ymm2,ymm0
        vmovdqa64       ymm20,ymm0
        vmovdqa64       ymm21,ymm0
        vmovdqa64       ymm22,ymm0
        vmovdqa64       ymm23,ymm0

        xor     r9d,r9d                 ; r9  = lane-0 scalar accumulator, first instance
        xor     r15d,r15d               ; r15 = lane-0 scalar accumulator, second instance

        mov     r11,rdx                 ; r11 = cursor into B (rdx is needed by mulx)
        mov     rax,0xfffffffffffff     ; rax = 2^52 - 1

        mov     ebx,20                  ; one digit of each B per iteration

ALIGN   32
$L$loop20:
; ---- first instance ---------------------------------------------------
        mov     r13,QWORD[r11]          ; r13 = B[i]

        vpbroadcastq    ymm3,r13        ; ymm3 = B[i] in every lane
        mov     rdx,QWORD[rsi]          ; rdx = A[0] (implicit mulx operand)
        mulx    r12,r13,r13             ; r12:r13 = A[0] * B[i]
        add     r9,r13                  ; acc += low half
        mov     r10,r12
        adc     r10,0                   ; r10 = high half + carry

        mov     r13,QWORD[r8]           ; first k factor
        imul    r13,r9
        and     r13,rax                 ; r13 = (k * acc) mod 2^52

        vpbroadcastq    ymm4,r13        ; ymm4 = that multiplier in every lane
        mov     rdx,QWORD[rcx]          ; rdx = M[0]
        mulx    r12,r13,r13             ; r12:r13 = M[0] * multiplier
        add     r9,r13
        adc     r10,r12                 ; r10:r9 += product

        shr     r9,52
        sal     r10,12
        or      r9,r10                  ; acc = (r10:r9) >> 52

        ; lanes += low 52 bits of B[i] * A[j]
        vpmadd52luq     ymm1,ymm3,YMMWORD[rsi]
        vpmadd52luq     ymm16,ymm3,YMMWORD[32+rsi]
        vpmadd52luq     ymm17,ymm3,YMMWORD[64+rsi]
        vpmadd52luq     ymm18,ymm3,YMMWORD[96+rsi]
        vpmadd52luq     ymm19,ymm3,YMMWORD[128+rsi]

        ; lanes += low 52 bits of multiplier * M[j]
        vpmadd52luq     ymm1,ymm4,YMMWORD[rcx]
        vpmadd52luq     ymm16,ymm4,YMMWORD[32+rcx]
        vpmadd52luq     ymm17,ymm4,YMMWORD[64+rcx]
        vpmadd52luq     ymm18,ymm4,YMMWORD[96+rcx]
        vpmadd52luq     ymm19,ymm4,YMMWORD[128+rcx]


        ; Move the accumulator down by one qword lane; a zero lane (from
        ; ymm0) enters at the top.
        valignq ymm1,ymm16,ymm1,1
        valignq ymm16,ymm17,ymm16,1
        valignq ymm17,ymm18,ymm17,1
        valignq ymm18,ymm19,ymm18,1
        valignq ymm19,ymm0,ymm19,1

        vmovq   r13,xmm1                ; new lane 0 ...
        add     r9,r13                  ; ... is folded into the scalar accumulator

        ; lanes += high 52 bits of the same products, applied after the
        ; lane shift so they land one digit higher than the low halves
        vpmadd52huq     ymm1,ymm3,YMMWORD[rsi]
        vpmadd52huq     ymm16,ymm3,YMMWORD[32+rsi]
        vpmadd52huq     ymm17,ymm3,YMMWORD[64+rsi]
        vpmadd52huq     ymm18,ymm3,YMMWORD[96+rsi]
        vpmadd52huq     ymm19,ymm3,YMMWORD[128+rsi]

        vpmadd52huq     ymm1,ymm4,YMMWORD[rcx]
        vpmadd52huq     ymm16,ymm4,YMMWORD[32+rcx]
        vpmadd52huq     ymm17,ymm4,YMMWORD[64+rcx]
        vpmadd52huq     ymm18,ymm4,YMMWORD[96+rcx]
        vpmadd52huq     ymm19,ymm4,YMMWORD[128+rcx]
; ---- second instance: same sequence, every array offset by 160 bytes,
; ---- accumulator in r15 / ymm2, ymm20..ymm23 ---------------------------
        mov     r13,QWORD[160+r11]

        vpbroadcastq    ymm3,r13
        mov     rdx,QWORD[160+rsi]
        mulx    r12,r13,r13
        add     r15,r13
        mov     r10,r12
        adc     r10,0

        mov     r13,QWORD[8+r8]         ; second k factor
        imul    r13,r15
        and     r13,rax

        vpbroadcastq    ymm4,r13
        mov     rdx,QWORD[160+rcx]
        mulx    r12,r13,r13
        add     r15,r13
        adc     r10,r12

        shr     r15,52
        sal     r10,12
        or      r15,r10

        vpmadd52luq     ymm2,ymm3,YMMWORD[160+rsi]
        vpmadd52luq     ymm20,ymm3,YMMWORD[192+rsi]
        vpmadd52luq     ymm21,ymm3,YMMWORD[224+rsi]
        vpmadd52luq     ymm22,ymm3,YMMWORD[256+rsi]
        vpmadd52luq     ymm23,ymm3,YMMWORD[288+rsi]

        vpmadd52luq     ymm2,ymm4,YMMWORD[160+rcx]
        vpmadd52luq     ymm20,ymm4,YMMWORD[192+rcx]
        vpmadd52luq     ymm21,ymm4,YMMWORD[224+rcx]
        vpmadd52luq     ymm22,ymm4,YMMWORD[256+rcx]
        vpmadd52luq     ymm23,ymm4,YMMWORD[288+rcx]


        valignq ymm2,ymm20,ymm2,1
        valignq ymm20,ymm21,ymm20,1
        valignq ymm21,ymm22,ymm21,1
        valignq ymm22,ymm23,ymm22,1
        valignq ymm23,ymm0,ymm23,1

        vmovq   r13,xmm2
        add     r15,r13

        vpmadd52huq     ymm2,ymm3,YMMWORD[160+rsi]
        vpmadd52huq     ymm20,ymm3,YMMWORD[192+rsi]
        vpmadd52huq     ymm21,ymm3,YMMWORD[224+rsi]
        vpmadd52huq     ymm22,ymm3,YMMWORD[256+rsi]
        vpmadd52huq     ymm23,ymm3,YMMWORD[288+rsi]

        vpmadd52huq     ymm2,ymm4,YMMWORD[160+rcx]
        vpmadd52huq     ymm20,ymm4,YMMWORD[192+rcx]
        vpmadd52huq     ymm21,ymm4,YMMWORD[224+rcx]
        vpmadd52huq     ymm22,ymm4,YMMWORD[256+rcx]
        vpmadd52huq     ymm23,ymm4,YMMWORD[288+rcx]
        lea     r11,[8+r11]             ; next digit of both B operands
        dec     ebx
        jne     NEAR $L$loop20

; ---- normalisation of the first instance (ymm1, ymm16..ymm19) ---------
        vmovdqa64       ymm4,YMMWORD[$L$mask52x4]       ; ymm4 = 2^52-1 per lane (aligned load)

        vpbroadcastq    ymm3,r9
        vpblendd        ymm1,ymm1,ymm3,3        ; lane 0 = scalar accumulator (dwords 0-1)



        ; ymm24..ymm28 = bits 52 and above of every lane (the "carries")
        vpsrlq  ymm24,ymm1,52
        vpsrlq  ymm25,ymm16,52
        vpsrlq  ymm26,ymm17,52
        vpsrlq  ymm27,ymm18,52
        vpsrlq  ymm28,ymm19,52


        ; move the carries up by one lane; lane 0 receives zero from ymm0
        valignq ymm28,ymm28,ymm27,3
        valignq ymm27,ymm27,ymm26,3
        valignq ymm26,ymm26,ymm25,3
        valignq ymm25,ymm25,ymm24,3
        valignq ymm24,ymm24,ymm0,3


        ; drop the carry bits from the lanes ...
        vpandq  ymm1,ymm1,ymm4
        vpandq  ymm16,ymm16,ymm4
        vpandq  ymm17,ymm17,ymm4
        vpandq  ymm18,ymm18,ymm4
        vpandq  ymm19,ymm19,ymm4


        ; ... and add each carry to the next higher lane
        vpaddq  ymm1,ymm1,ymm24
        vpaddq  ymm16,ymm16,ymm25
        vpaddq  ymm17,ymm17,ymm26
        vpaddq  ymm18,ymm18,ymm27
        vpaddq  ymm19,ymm19,ymm28



        ; predicate 1 (less-than): lanes where 2^52-1 < lane (overflowed)
        vpcmpuq k1,ymm4,ymm1,1
        vpcmpuq k2,ymm4,ymm16,1
        vpcmpuq k3,ymm4,ymm17,1
        vpcmpuq k4,ymm4,ymm18,1
        vpcmpuq k5,ymm4,ymm19,1
        kmovb   r14d,k1
        kmovb   r13d,k2
        kmovb   r12d,k3
        kmovb   r11d,k4
        kmovb   r10d,k5


        ; predicate 0 (equal): lanes that are exactly 2^52-1 (saturated)
        vpcmpuq k1,ymm4,ymm1,0
        vpcmpuq k2,ymm4,ymm16,0
        vpcmpuq k3,ymm4,ymm17,0
        vpcmpuq k4,ymm4,ymm18,0
        vpcmpuq k5,ymm4,ymm19,0
        kmovb   r9d,k1
        kmovb   r8d,k2                  ; r8 (pointer to the k factors) is no longer needed
        kmovb   ebx,k3
        kmovb   ecx,k4
        kmovb   edx,k5



        ; pack the 4-bit overflow masks pairwise into bytes:
        ; r14b = lanes 0-7, r12b = lanes 8-15, r10b = lanes 16-19
        shl     r13b,4
        or      r14b,r13b
        shl     r11b,4
        or      r12b,r11b

        ; shift the 20-bit overflow mask up by one lane
        add     r14b,r14b
        adc     r12b,r12b
        adc     r10b,r10b

        ; pack the saturated masks the same way: r9b, bl, dl
        shl     r8b,4
        or      r9b,r8b
        shl     cl,4
        or      bl,cl

        ; let carries ripple through runs of saturated lanes ...
        add     r14b,r9b
        adc     r12b,bl
        adc     r10b,dl

        ; ... and keep a 1 for every lane that receives a carry
        xor     r14b,r9b
        xor     r12b,bl
        xor     r10b,dl

        ; unpack back into one 4-bit mask per register
        kmovb   k1,r14d
        shr     r14b,4
        kmovb   k2,r14d
        kmovb   k3,r12d
        shr     r12b,4
        kmovb   k4,r12d
        kmovb   k5,r10d


        ; selected lanes: subtract 2^52-1; with the masking below this
        ; is "+1 modulo 2^52"
        vpsubq  ymm1{k1},ymm1,ymm4
        vpsubq  ymm16{k2},ymm16,ymm4
        vpsubq  ymm17{k3},ymm17,ymm4
        vpsubq  ymm18{k4},ymm18,ymm4
        vpsubq  ymm19{k5},ymm19,ymm4

        vpandq  ymm1,ymm1,ymm4
        vpandq  ymm16,ymm16,ymm4
        vpandq  ymm17,ymm17,ymm4
        vpandq  ymm18,ymm18,ymm4
        vpandq  ymm19,ymm19,ymm4

; ---- normalisation of the second instance (ymm2, ymm20..ymm23): the
; ---- same sequence, seeded with the scalar accumulator r15 ------------
        vpbroadcastq    ymm3,r15
        vpblendd        ymm2,ymm2,ymm3,3



        vpsrlq  ymm24,ymm2,52
        vpsrlq  ymm25,ymm20,52
        vpsrlq  ymm26,ymm21,52
        vpsrlq  ymm27,ymm22,52
        vpsrlq  ymm28,ymm23,52


        valignq ymm28,ymm28,ymm27,3
        valignq ymm27,ymm27,ymm26,3
        valignq ymm26,ymm26,ymm25,3
        valignq ymm25,ymm25,ymm24,3
        valignq ymm24,ymm24,ymm0,3


        vpandq  ymm2,ymm2,ymm4
        vpandq  ymm20,ymm20,ymm4
        vpandq  ymm21,ymm21,ymm4
        vpandq  ymm22,ymm22,ymm4
        vpandq  ymm23,ymm23,ymm4


        vpaddq  ymm2,ymm2,ymm24
        vpaddq  ymm20,ymm20,ymm25
        vpaddq  ymm21,ymm21,ymm26
        vpaddq  ymm22,ymm22,ymm27
        vpaddq  ymm23,ymm23,ymm28



        vpcmpuq k1,ymm4,ymm2,1
        vpcmpuq k2,ymm4,ymm20,1
        vpcmpuq k3,ymm4,ymm21,1
        vpcmpuq k4,ymm4,ymm22,1
        vpcmpuq k5,ymm4,ymm23,1
        kmovb   r14d,k1
        kmovb   r13d,k2
        kmovb   r12d,k3
        kmovb   r11d,k4
        kmovb   r10d,k5


        vpcmpuq k1,ymm4,ymm2,0
        vpcmpuq k2,ymm4,ymm20,0
        vpcmpuq k3,ymm4,ymm21,0
        vpcmpuq k4,ymm4,ymm22,0
        vpcmpuq k5,ymm4,ymm23,0
        kmovb   r9d,k1
        kmovb   r8d,k2
        kmovb   ebx,k3
        kmovb   ecx,k4
        kmovb   edx,k5



        shl     r13b,4
        or      r14b,r13b
        shl     r11b,4
        or      r12b,r11b

        add     r14b,r14b
        adc     r12b,r12b
        adc     r10b,r10b

        shl     r8b,4
        or      r9b,r8b
        shl     cl,4
        or      bl,cl

        add     r14b,r9b
        adc     r12b,bl
        adc     r10b,dl

        xor     r14b,r9b
        xor     r12b,bl
        xor     r10b,dl

        kmovb   k1,r14d
        shr     r14b,4
        kmovb   k2,r14d
        kmovb   k3,r12d
        shr     r12b,4
        kmovb   k4,r12d
        kmovb   k5,r10d


        vpsubq  ymm2{k1},ymm2,ymm4
        vpsubq  ymm20{k2},ymm20,ymm4
        vpsubq  ymm21{k3},ymm21,ymm4
        vpsubq  ymm22{k4},ymm22,ymm4
        vpsubq  ymm23{k5},ymm23,ymm4

        vpandq  ymm2,ymm2,ymm4
        vpandq  ymm20,ymm20,ymm4
        vpandq  ymm21,ymm21,ymm4
        vpandq  ymm22,ymm22,ymm4
        vpandq  ymm23,ymm23,ymm4

        ; store the first result (bytes 0..159) ...
        vmovdqu64       YMMWORD[rdi],ymm1
        vmovdqu64       YMMWORD[32+rdi],ymm16
        vmovdqu64       YMMWORD[64+rdi],ymm17
        vmovdqu64       YMMWORD[96+rdi],ymm18
        vmovdqu64       YMMWORD[128+rdi],ymm19

        ; ... and the second one (bytes 160..319)
        vmovdqu64       YMMWORD[160+rdi],ymm2
        vmovdqu64       YMMWORD[192+rdi],ymm20
        vmovdqu64       YMMWORD[224+rdi],ymm21
        vmovdqu64       YMMWORD[256+rdi],ymm22
        vmovdqu64       YMMWORD[288+rdi],ymm23

        vzeroupper
        ; reload the six pushed registers (reverse push order) and
        ; release the 48 bytes they occupied
        mov     r15,QWORD[rsp]

        mov     r14,QWORD[8+rsp]

        mov     r13,QWORD[16+rsp]

        mov     r12,QWORD[24+rsp]

        mov     rbp,QWORD[32+rsp]

        mov     rbx,QWORD[40+rsp]

        lea     rsp,[48+rsp]

$L$rsaz_amm52x20_x2_256_epilogue:
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret

$L$SEH_end_ossl_rsaz_amm52x20_x2_256:
section .text code align=64


;-----------------------------------------------------------------------
; ossl_extract_multiplier_2x20_win5(arg1, arg2, arg3, arg4)
;
; Copies one 160-byte entry out of a table of 32 entries. Every entry is
; read and the wanted one is selected with a compare mask, so the
; sequence of memory addresses touched does not depend on arg3.
;
; ABI:   Microsoft x64 on entry. The arguments are moved straight into
;        the registers the body is written for:
;          rcx -> rdi  arg1: output, 160 bytes written
;          rdx -> rsi  arg2: table base; 32 entries spaced 320 bytes apart
;          r8  -> rdx  arg3: index of the entry to extract (compared
;                            against a counter running 0..31)
;          r9  -> rcx  arg4: selects the 160-byte half inside every
;                            320-byte entry (byte offset = arg4 * 160)
;        NOTE(review): the full 64-bit values of arg3/arg4 are used. If
;        the C prototype declares them as 32-bit ints, the caller must
;        pass them zero-extended -- confirm.
; Out:   160 bytes at arg1; all zero if arg3 matches no counter value.
; Clobb: rax, rcx, rdx, ymm0-ymm4, ymm16-ymm23, k1, flags.
;        rdi/rsi are restored from the caller's home space.
;-----------------------------------------------------------------------
ALIGN   32
global  ossl_extract_multiplier_2x20_win5

ossl_extract_multiplier_2x20_win5:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_ossl_extract_multiplier_2x20_win5:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9



DB      243,15,30,250                   ; bytes F3 0F 1E FA = endbr64
        lea     rax,[rcx*4+rcx]
        sal     rax,5                   ; rax = arg4 * 5 * 32 = arg4 * 160
        add     rsi,rax                 ; rsi -> selected half of entry 0

        vmovdqa64       ymm23,YMMWORD[$L$ones]  ; per-lane increment (aligned load)
        vpbroadcastq    ymm22,rdx       ; ymm22 = wanted index in every lane
        lea     rax,[10240+rsi]         ; end of table: 32 entries * 320 bytes

        ; ymm0..ymm4 = result (starts as zero), ymm21 = running entry counter
        vpxor   xmm4,xmm4,xmm4
        vmovdqa64       ymm3,ymm4
        vmovdqa64       ymm2,ymm4
        vmovdqa64       ymm1,ymm4
        vmovdqa64       ymm0,ymm4
        vmovdqa64       ymm21,ymm4

ALIGN   32
$L$loop:
        vpcmpq  k1,ymm22,ymm21,0        ; k1 = all lanes set iff counter == wanted index
        add     rsi,320                 ; step to the next entry (loads below use negative offsets)
        vpaddq  ymm21,ymm21,ymm23       ; counter += 1
        vmovdqu64       ymm16,YMMWORD[((-320))+rsi]
        vmovdqu64       ymm17,YMMWORD[((-288))+rsi]
        vmovdqu64       ymm18,YMMWORD[((-256))+rsi]
        vmovdqu64       ymm19,YMMWORD[((-224))+rsi]
        vmovdqu64       ymm20,YMMWORD[((-192))+rsi]
        ; take the freshly loaded entry where k1 is set, else keep the result
        vpblendmq       ymm0{k1},ymm0,ymm16
        vpblendmq       ymm1{k1},ymm1,ymm17
        vpblendmq       ymm2{k1},ymm2,ymm18
        vpblendmq       ymm3{k1},ymm3,ymm19
        vpblendmq       ymm4{k1},ymm4,ymm20
        cmp     rax,rsi
        jne     NEAR $L$loop

        vmovdqu64       YMMWORD[rdi],ymm0
        vmovdqu64       YMMWORD[32+rdi],ymm1
        vmovdqu64       YMMWORD[64+rdi],ymm2
        vmovdqu64       YMMWORD[96+rdi],ymm3
        vmovdqu64       YMMWORD[128+rdi],ymm4

        ; ymm0-ymm4 were written above. Clear the upper halves before
        ; returning, as both amm52x20 routines in this file do, so the
        ; caller does not inherit dirty upper-YMM state.
        vzeroupper

        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret

$L$SEH_end_ossl_extract_multiplier_2x20_win5:
section .data data align=8

; Four qwords of 1: the per-lane increment added to the running entry
; counter in ossl_extract_multiplier_2x20_win5. It is read with the
; aligned vmovdqa64 form, hence the ALIGN 32.
ALIGN   32
$L$ones:
        DQ      1,1,1,1
EXTERN  __imp_RtlVirtualUnwind

; The constants just above are emitted into .data. The handler is
; executable code that .xdata registers as an exception handler, so
; switch back to the code section before emitting it.
section .text code align=64

;-----------------------------------------------------------------------
; rsaz_def_handler -- Win64 language-specific exception handler shared
; by the routines in this file (registered through .xdata).
;
; In:    r8 = CONTEXT record of the faulting frame
;        r9 = DISPATCHER_CONTEXT
;        (rcx and rdx, the first two handler arguments, are not read)
;        HandlerData (at [56+r9]) points at two image-relative
;        addresses: [0] = end of the register-saving prologue,
;        [1] = start of the epilogue.
; Does:  reconstructs the caller's rsp, rsi, rdi and, when the fault
;        lies between the two labels, rbx, rbp, r12-r15 in the CONTEXT,
;        copies the CONTEXT to the dispatcher's record and calls
;        RtlVirtualUnwind.
; Out:   eax = 1
; Stack: 8 pushes + pushfq + 64 bytes (32 shadow + 4 stack arguments).
;
; The numeric offsets below follow the Windows x64 CONTEXT and
; DISPATCHER_CONTEXT layouts.
;-----------------------------------------------------------------------
ALIGN   16
rsaz_def_handler:
        push    rsi
        push    rdi
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        pushfq
        sub     rsp,64

        mov     rax,QWORD[120+r8]       ; context->Rax (entry rsp, set by "mov rax,rsp")
        mov     rbx,QWORD[248+r8]       ; context->Rip

        mov     rsi,QWORD[8+r9]         ; disp->ImageBase
        mov     r11,QWORD[56+r9]        ; disp->HandlerData

        mov     r10d,DWORD[r11]         ; HandlerData[0]
        lea     r10,[r10*1+rsi]         ; absolute address of the body label
        cmp     rbx,r10
        jb      NEAR $L$common_seh_tail ; fault before the body: rax is still the entry rsp

        mov     rax,QWORD[152+r8]       ; context->Rsp

        mov     r10d,DWORD[4+r11]       ; HandlerData[1]
        lea     r10,[r10*1+rsi]         ; absolute address of the epilogue label
        cmp     rbx,r10
        jae     NEAR $L$common_seh_tail ; fault in the epilogue: registers already reloaded

        lea     rax,[48+rax]            ; step over the six pushed registers

        mov     rbx,QWORD[((-8))+rax]
        mov     rbp,QWORD[((-16))+rax]
        mov     r12,QWORD[((-24))+rax]
        mov     r13,QWORD[((-32))+rax]
        mov     r14,QWORD[((-40))+rax]
        mov     r15,QWORD[((-48))+rax]
        mov     QWORD[144+r8],rbx       ; context->Rbx
        mov     QWORD[160+r8],rbp       ; context->Rbp
        mov     QWORD[216+r8],r12       ; context->R12
        mov     QWORD[224+r8],r13       ; context->R13
        mov     QWORD[232+r8],r14       ; context->R14
        mov     QWORD[240+r8],r15       ; context->R15

$L$common_seh_tail:
        ; rax = rsp as it was at function entry; rdi/rsi were parked in
        ; the caller's home space by the WIN64 prologue
        mov     rdi,QWORD[8+rax]
        mov     rsi,QWORD[16+rax]
        mov     QWORD[152+r8],rax       ; context->Rsp
        mov     QWORD[168+r8],rsi       ; context->Rsi
        mov     QWORD[176+r8],rdi       ; context->Rdi

        mov     rdi,QWORD[40+r9]        ; disp->ContextRecord (copy destination)
        mov     rsi,r8                  ; copy source = adjusted context
        mov     ecx,154                 ; 154 qwords = size of CONTEXT
        DD      0xa548f3fc              ; bytes FC F3 48 A5 = cld ; rep movsq

        ; RtlVirtualUnwind(HandlerType = 0, ImageBase, ControlPc,
        ;                  FunctionEntry, ContextRecord, &HandlerData,
        ;                  &EstablisherFrame, ContextPointers = NULL)
        mov     rsi,r9
        xor     rcx,rcx                 ; arg1 = 0
        mov     rdx,QWORD[8+rsi]        ; arg2 = disp->ImageBase
        mov     r8,QWORD[rsi]           ; arg3 = disp->ControlPc
        mov     r9,QWORD[16+rsi]        ; arg4 = disp->FunctionEntry
        mov     r10,QWORD[40+rsi]       ; arg5 = disp->ContextRecord
        lea     r11,[56+rsi]            ; arg6 = &disp->HandlerData
        lea     r12,[24+rsi]            ; arg7 = &disp->EstablisherFrame
        mov     QWORD[32+rsp],r10
        mov     QWORD[40+rsp],r11
        mov     QWORD[48+rsp],r12
        mov     QWORD[56+rsp],rcx       ; arg8 = NULL
        call    QWORD[__imp_RtlVirtualUnwind]

        mov     eax,1                   ; returned to the dispatcher (ExceptionContinueSearch)
        add     rsp,64
        popfq
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
        pop     rdi
        pop     rsi
        DB      0F3h,0C3h               ;repret


; Function table: one entry per routine with three image-relative
; addresses -- begin of the covered range, end of the covered range,
; and the matching unwind-info record in .xdata.
section .pdata rdata align=4
ALIGN   4
        DD      $L$SEH_begin_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
        DD      $L$SEH_end_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
        DD      $L$SEH_info_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase

        DD      $L$SEH_begin_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
        DD      $L$SEH_end_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
        DD      $L$SEH_info_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase

        DD      $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
        DD      $L$SEH_end_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
        DD      $L$SEH_info_ossl_extract_multiplier_2x20_win5 wrt ..imagebase

; Unwind info: every record is a 4-byte header (first byte 9, then three
; zero bytes: no unwind codes are described, all unwinding is delegated
; to the handler), followed by the handler address and two
; image-relative labels that rsaz_def_handler reads as HandlerData[0]
; (end of prologue) and HandlerData[1] (start of epilogue).
section .xdata rdata align=8
ALIGN   8
$L$SEH_info_ossl_rsaz_amm52x20_x1_256:
DB      9,0,0,0
        DD      rsaz_def_handler wrt ..imagebase
        DD      $L$rsaz_amm52x20_x1_256_body wrt ..imagebase,$L$rsaz_amm52x20_x1_256_epilogue wrt ..imagebase
$L$SEH_info_ossl_rsaz_amm52x20_x2_256:
DB      9,0,0,0
        DD      rsaz_def_handler wrt ..imagebase
        DD      $L$rsaz_amm52x20_x2_256_body wrt ..imagebase,$L$rsaz_amm52x20_x2_256_epilogue wrt ..imagebase
; The extract routine pushes nothing, so both labels are its begin
; label: the handler then always takes the "at or past the epilogue"
; path and only restores rsp, rsi and rdi.
$L$SEH_info_ossl_extract_multiplier_2x20_win5:
DB      9,0,0,0
        DD      rsaz_def_handler wrt ..imagebase
        DD      $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase,$L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
